library(ggplot2); # For some graphs
library(FactoMineR); # For PCA
library(factoextra); # For PCA
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(caret); # For feature plots
## Loading required package: lattice
library(e1071); # For SVM
library(nnet); # For Neural nets
library(pROC);
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(cluster) # For k-means
library(fpc) # For k-means
library(seriation) # For k-means
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
##
## Attaching package: 'seriation'
## The following object is masked from 'package:lattice':
##
## panel.lines
library(ggpubr) # For clustering visualization
library(TSclust) # For clustering Evaluation
## Loading required package: pdc
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(knitr); # For table printing
# Build the vector of input attribute names: one entry per (pixel, channel)
# pair — "p1.Red", "p1.Green", ..., "p9.NIR2" — followed by the "class"
# label column. A single vectorized paste0 over the pixel/channel grid
# replaces the original nested loops that grew the vector with c().
channel.names = c("Red", "Green", "NIR1", "NIR2")
attribute.names = paste0(
  "p", rep(1:9, each = length(channel.names)),
  ".", rep(channel.names, times = 9)
)
attribute.names = c(attribute.names, "class")
# Create a vector containing all class names. The order must follow the
# numeric class coding used in the csv so the factor labels line up.
class.names = c("red soil", "cotton crop", "grey soil", "damp grey soil", "soil with vegetation stubble", "very damp grey soil")
# Load the dataset from the (space-separated) csv file and cast the label to factor
dataset = data.frame(read.csv("dataset.csv", col.names = attribute.names, sep=" "))
dataset$class = factor(dataset$class, labels = class.names)
# Determine the input and the output space.
# Fixed precedence: the original `1:ncol(dataset)-1` parsed as
# (1:ncol(dataset))-1, i.e. 0:(ncol-1), and only selected the right columns
# because R silently drops a 0 index. Make "all but the last column" explicit.
dataset.input = dataset[, 1:(ncol(dataset)-1)]
dataset.output = dataset$class
# Determine some statistical measures on the input attributes.
# vapply is used instead of sapply so each result has a guaranteed
# type/shape (sapply's return type silently depends on its input).
dataset.mean = vapply(dataset.input, mean, numeric(1))
dataset.sd = vapply(dataset.input, sd, numeric(1))
dataset.var = vapply(dataset.input, var, numeric(1))
dataset.min = vapply(dataset.input, min, numeric(1))
dataset.max = vapply(dataset.input, max, numeric(1))
dataset.median = vapply(dataset.input, median, numeric(1))
dataset.range = vapply(dataset.input, range, numeric(2))       # matrix: min/max per column
dataset.quantile = vapply(dataset.input, quantile, numeric(5)) # matrix: 0/25/50/75/100% per column
# Tabulate how many observations fall in each class and derive each class's
# relative frequency as a formatted percentage string.
class.frequencies = setNames(data.frame(table(dataset.output)), c("class", "frequencies"))
class.share = class.frequencies$frequencies / sum(class.frequencies$frequencies)
class.frequencies$percentages = paste0(format(class.share * 100, digits = 2), "%")
# Bar chart: absolute frequencies, with the percentage printed on each bar
ggplot(data = class.frequencies, aes(x = class, y = frequencies, fill = class)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = percentages), vjust = 1.5, colour = "white")
# Pie chart: the same frequencies shown as shares of the whole dataset
ggplot(data = class.frequencies, aes(x = "", y = frequencies, fill = class)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0)
## Separabilità: i valori relativi allo stesso canale tra pixel adiacenti risultano correlati (come evidente dai grafici), questo perché il dataset è costruito come una sliding window quadrata 3x3 su una singola immagine satellitare. Ci si aspetta quindi una varianza piuttosto contenuta sui singoli valori e la sopra citata correlazione.
## TODO: Aggiungere titoli ai feature plot
# Pairwise feature plots on the raw attributes to visualise class overlap.
# Fixed: the scales argument previously read `list((relation="free"), ...)`,
# which produced an unnamed list element (and assigned a stray global
# `relation` variable as a side effect) instead of freeing the x axis;
# it must be x=list(relation="free"), as in the density plots further below.
featurePlot(dataset.input[, c("p5.Red", "p5.Green", "p5.NIR1")], dataset.output, plot="pairs", scales=list(x=list(relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(dataset.input[, c("p5.Red", "p6.Red", "p4.Red")], dataset.output, plot="pairs", scales=list(x=list(relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(dataset.input[, c("p5.Green", "p6.Green", "p4.Green")], dataset.output, plot="pairs", scales=list(x=list(relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(dataset.input[, c("p5.Green", "p2.Green", "p8.Green")], dataset.output, plot="pairs", scales=list(x=list(relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(dataset.input[, c("p5.Red", "p2.Red", "p8.Red")], dataset.output, plot="pairs", scales=list(x=list(relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(dataset.input[, c("p5.NIR1", "p2.NIR1", "p8.NIR1")], dataset.output, plot="pairs", scales=list(x=list(relation="free"), y=list(relation="free")), auto.key=list(columns=3))
Data la correlazione rilevata precedentemente e l’alto numero di attributi in input sembra più che opportuno eseguire una PCA sul dataset al fine di ridurre la dimensione dello spazio di input.
# Execute the PCA on the standardized inputs (scale.unit = TRUE) and keep
# only the first four principal components (ncp = 4)
pca.results <- PCA(dataset.input, scale.unit = TRUE, ncp = 4, graph = FALSE)
# Get the eigenvalues and plot them. The first four components explain about 92% of variance
pca.results.eig.val <- get_eigenvalue(pca.results)
# Scree plot: percentage of explained variance per component, with labels
fviz_eig(pca.results, addlabels = TRUE, ylim = c(0, 50))
# Show the correlation between the variables using the first two principal components.
# Many variables are positively correlated (arrows pointing in similar directions).
fviz_pca_var(pca.results, col.var = "black")
Analizzando le coordinate per i singoli attributi si riesce a capire che i 4 cluster rappresentano i 4 tipi di variabili in gioco (Red, Green, NIR1, NIR2)
# Project every observation onto the retained principal components and
# rebuild a labelled dataset in the reduced input space.
dataset.input.pca = data.frame(get_pca_ind(pca.results)$coord)
dataset.pca = cbind(class = dataset$class, dataset.input.pca)
# Density plot per component: shows how separable each class is along each
# principal direction.
featurePlot(
  x = dataset.input.pca,
  y = dataset.output,
  plot = "density",
  scales = list(x = list(relation = "free"), y = list(relation = "free")),
  auto.key = list(columns = 3)
)
# Pairwise scatter of the components, coloured by class.
featurePlot(
  x = dataset.input.pca,
  y = dataset.output,
  plot = "pairs",
  auto.key = list(columns = 3)
)
# Split the observations into a training (~70%) and a test (~30%) partition.
# NOTE(review): no set.seed() is called, so the split differs between runs.
indexes = sample(2, nrow(dataset.pca), replace = TRUE, prob = c(0.7, 0.3))
temp.trainset = dataset.pca[indexes == 1, ]
testset = dataset.pca[indexes == 2, ]
feature.columns = setdiff(names(testset), "class")
testset.x = testset[, feature.columns]
testset.y = testset$class
# Carve a validation partition (~10%) out of the training portion.
indexes = sample(2, nrow(temp.trainset), replace = TRUE, prob = c(0.9, 0.1))
trainset = temp.trainset[indexes == 1, ]
validationset = temp.trainset[indexes == 2, ]
validationset.x = validationset[, setdiff(names(validationset), "class")]
validationset.y = validationset$class
# Tune an RBF-kernel SVM over a gamma/cost grid. sampling = "fix" makes
# tune.svm score each candidate on the supplied validation set instead of
# cross-validating on the training data.
svm.tuning.control = tune.control(sampling = "fix")
svm.tuning = tune.svm(class ~ ., data = trainset, kernel = "radial", gamma = 10^(-3:1), cost = 10^(-2:1), validation.x = validationset[,!names(validationset) %in% c("class")], validation.y = validationset[,c("class")], tunecontrol = svm.tuning.control, probability = TRUE)
plot(svm.tuning)
svm.tuned = svm.tuning$best.model
# Evaluate the selected model on the validation set
svm.tuned.validation.prediction = predict(svm.tuned, validationset.x)
svm.tuned.validation.confusion_matrix = table(svm.tuned.validation.prediction, validationset.y)
svm.tuned.accuracy = sum(diag(svm.tuned.validation.confusion_matrix)) / sum(svm.tuned.validation.confusion_matrix)
sprintf("Tuned SVM accuracy is %2.1f%%", (1 - svm.tuning$best.performance) * 100)
## [1] "Tuned SVM accuracy is 89.8%"
# %g instead of %d: gamma and cost are doubles (the grid contains 0.001,
# 0.01, ...), and R's sprintf rejects %d for non-integral numeric values.
sprintf("Best SVM parameters were gamma: %g cost: %g", svm.tuning$best.parameters$gamma, svm.tuning$best.parameters$cost)
## [1] "Best SVM parameters were gamma: 1 cost: 1"
print(svm.tuned.validation.confusion_matrix)
## validationset.y
## svm.tuned.validation.prediction red soil cotton crop grey soil damp grey soil
## red soil 99 0 0 0
## cotton crop 0 51 0 1
## grey soil 0 0 95 8
## damp grey soil 0 0 2 25
## soil with vegetation stubble 0 2 0 1
## very damp grey soil 0 1 2 11
## validationset.y
## svm.tuned.validation.prediction soil with vegetation stubble
## red soil 3
## cotton crop 3
## grey soil 0
## damp grey soil 0
## soil with vegetation stubble 45
## very damp grey soil 4
## validationset.y
## svm.tuned.validation.prediction very damp grey soil
## red soil 0
## cotton crop 0
## grey soil 2
## damp grey soil 4
## soil with vegetation stubble 3
## very damp grey soil 98
# Tune a single-hidden-layer neural network over hidden-layer size and weight
# decay, on the same fixed validation split used for the SVM. The tune.control
# object is now actually passed to tune.nnet (it was created but unused),
# matching the SVM tuning block.
nnet.tuning.control = tune.control(sampling = 'fix', performances = TRUE)
# Decay grid fixed: `2^-2:1` parsed as (2^-2):1 = just 0.25 because `^` binds
# tighter than `:`; the intended grid is 2^(-2:1) = 0.25, 0.5, 1, 2.
nnet.tuning = tune.nnet(class ~., data = trainset, size = c(2:4), validation.x = validationset.x, validation.y = validationset.y, decay = c(0, 2^(-2:1)), tunecontrol = nnet.tuning.control)
nnet.tuned = nnet.tuning$best.model
plot(nnet.tuning)
acc = 1 - nnet.tuning$best.performance
sprintf("Tuned neuralnet overall accuracy is %2.1f%%", acc * 100)
## [1] "Tuned neuralnet overall accuracy is 85.9%"
# %g for decay: it takes fractional values (0.25, 0.5), which %d rejects.
sprintf("Tuned neuralnet best parameters were size: %d decay: %g", nnet.tuning$best.parameters$size, nnet.tuning$best.parameters$decay)
## [1] "Tuned neuralnet best parameters were size: 4 decay: 0"
# Evaluate the tuned network on the test set: pick the class with the highest
# predicted probability for each row.
predictions = predict(nnet.tuning$best.model, testset.x)
predictions.classes = class.names[max.col(predictions)]
errors = predictions.classes != testset.y
# Accuracy is the share of CORRECT predictions; the original assigned the
# error rate to `acc` here.
acc = 1 - length(which(errors)) / length(errors)
nnet.tuned.validation.predictions = predictions.classes
nnet.tuned.validation.confusion_matrix = table(nnet.tuned.validation.predictions, testset.y)
print(nnet.tuned.validation.confusion_matrix)
## testset.y
## nnet.tuned.validation.predictions red soil cotton crop grey soil damp grey soil
## cotton crop 0 188 0 0
## damp grey soil 2 1 24 51
## grey soil 8 0 394 40
## red soil 430 0 0 3
## soil with vegetation stubble 7 14 1 13
## very damp grey soil 0 1 4 67
## testset.y
## nnet.tuned.validation.predictions soil with vegetation stubble
## cotton crop 8
## damp grey soil 4
## grey soil 1
## red soil 5
## soil with vegetation stubble 158
## very damp grey soil 17
## testset.y
## nnet.tuned.validation.predictions very damp grey soil
## cotton crop 0
## damp grey soil 54
## grey soil 15
## red soil 0
## soil with vegetation stubble 22
## very damp grey soil 397
# Compare the two tuned models via multiclass ROC AUC on the validation set.
svm.tuned.validation.predictions = predict(svm.tuned, validationset.x, probability = T)
# e1071 attaches the per-class probability matrix as the "probabilities"
# attribute of the prediction object
svm.tuned.validation.predictions.probs = attr(svm.tuned.validation.predictions, "probabilities")
svm.roc = multiclass.roc(validationset.y, svm.tuned.validation.predictions.probs)
# NOTE(review): predict.nnet has no `probability` argument (it is silently
# swallowed by ...); the returned matrix already holds class probabilities
# for a multinomial nnet — confirm against the nnet documentation.
nnet.tuned.validation.predictions = predict(nnet.tuned, validationset.x, probability = T)
nnet.roc = multiclass.roc(validationset.y, nnet.tuned.validation.predictions)
sprintf("SVM ROC AUC: %f", svm.roc$auc)
## [1] "SVM ROC AUC: 0.980260"
sprintf("NNET ROC AUC: %f", nnet.roc$auc)
## [1] "NNET ROC AUC: 0.972077"
source("compute_f_scores.R")
# Per-class weights = relative class frequencies, ordered like class.names.
# Computed vectorially (named numeric vector) instead of growing a list in a
# loop and mapply-ing over it — same values, same names, same order.
total_instances_count = sum(class.frequencies$frequencies)
weights = setNames(
  class.frequencies$frequencies[match(class.names, class.frequencies$class)] / total_instances_count,
  class.names
)
fscores = compute_f_scores(class.names, svm.tuned.validation.confusion_matrix, weights)
# Macro F-score: unweighted mean of the per-class F-scores
svm.macro.fscore = Reduce('+', fscores$fscores) / length(class.names)
# NOTE(review): this is the frequency-WEIGHTED average F-score; true
# micro-averaged F equals overall accuracy for multiclass problems.
svm.micro.fscore = Reduce('+', fscores$fscores.weighted)
sprintf("SVM macro F-Score: %1.3f", svm.macro.fscore)
## [1] "SVM macro F-Score: 0.872"
sprintf("SVM micro F-Score: %1.3f", svm.micro.fscore)
## [1] "SVM micro F-Score: 0.896"
# Tabulate precision/recall (as percentages) and F-score for every class
per_class_metrics = data.frame("Precision" = unname(unlist(fscores$precisions)), "Recall" = unname(unlist(fscores$recalls)), "Fscore" = unname(unlist(fscores$fscores)))
rownames(per_class_metrics) = names(fscores$precisions)
per_class_metrics$Precision = per_class_metrics$Precision * 100
per_class_metrics$Recall = per_class_metrics$Recall * 100
print(per_class_metrics)
## Precision Recall Fscore
## red soil 97.05882 100.00000 0.9850746
## cotton crop 92.72727 94.44444 0.9357798
## grey soil 90.47619 95.95960 0.9313725
## damp grey soil 80.64516 54.34783 0.6493506
## soil with vegetation stubble 88.23529 81.81818 0.8490566
## very damp grey soil 84.48276 91.58879 0.8789238
# Same per-class evaluation, this time for the tuned neural network.
fscores = compute_f_scores(class.names, nnet.tuned.validation.confusion_matrix, weights)
# Macro = plain average of per-class F-scores; "micro" here = frequency-weighted sum.
nnet.macro.fscore = sum(unlist(fscores$fscores)) / length(class.names)
nnet.micro.fscore = sum(unlist(fscores$fscores.weighted))
sprintf("NNet macro F-Score: %1.3f", nnet.macro.fscore)
## [1] "NNet macro F-Score: 0.787"
sprintf("Net micro F-Score: %1.3f", nnet.micro.fscore)
## [1] "Net micro F-Score: 0.831"
# Tabulate precision/recall (as percentages) and F-score for every class
per_class_metrics = data.frame(
  Precision = 100 * unname(unlist(fscores$precisions)),
  Recall = 100 * unname(unlist(fscores$recalls)),
  Fscore = unname(unlist(fscores$fscores)),
  row.names = names(fscores$precisions)
)
print(per_class_metrics)
## Precision Recall Fscore
## red soil 98.17352 96.19687 0.9717514
## cotton crop 95.91837 92.15686 0.9400000
## grey soil 86.02620 93.14421 0.8944381
## damp grey soil 37.50000 29.31034 0.3290323
## soil with vegetation stubble 73.48837 81.86528 0.7745098
## very damp grey soil 81.68724 81.35246 0.8151951